{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a98030af-fcd1-4d63-a36e-38ba053498fa",
   "metadata": {},
   "source": [
    "# A full business solution\n",
    "\n",
    "Create a product that builds a Brochure for a company to be used for prospective clients, investors and potential recruits.\n",
    "\n",
    "We will be provided with a company name and its primary website."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0a572211-5fe3-4dd5-9870-849cfb75901f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import necessary libraries\n",
    "import os\n",
    "import requests\n",
    "import json\n",
    "from typing import List, Dict\n",
    "from dotenv import load_dotenv\n",
    "from bs4 import BeautifulSoup\n",
    "from IPython.display import Markdown, display, update_display\n",
    "from openai import OpenAI\n",
    "from urllib.parse import urljoin\n",
    "\n",
    "# Load environment variables from a .env file\n",
    "load_dotenv()\n",
    "\n",
    "# Define constants\n",
    "MODEL = 'gpt-4o-mini'  # Specify the OpenAI model to use\n",
    "OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')  # Get API key from environment or use default\n",
    "\n",
    "# Initialize OpenAI client with the API key\n",
    "openai = OpenAI(api_key=OPENAI_API_KEY)\n",
    "\n",
    "class Website:\n",
    "    \"\"\"\n",
    "    A class to represent a website and its contents (title, visible text, links).\n",
    "    \"\"\"\n",
    "    def __init__(self, url: str):\n",
    "        \"\"\"\n",
    "        Initialize the Website object with a given URL.\n",
    "        \n",
    "        :param url: The URL of the website to scrape\n",
    "        \"\"\"\n",
    "        self.url = url\n",
    "        self.title, self.text, self.links = self._scrape_website()\n",
    "\n",
    "    def _scrape_website(self) -> tuple:\n",
    "        \"\"\"\n",
    "        Scrape the website content, extracting title, text, and links.\n",
    "        \n",
    "        :return: A tuple containing the title, text content, and links of the website\n",
    "        :raises requests.HTTPError: if the server responds with an error status\n",
    "        :raises requests.Timeout: if the server does not respond within 30 seconds\n",
    "        \"\"\"\n",
    "        # A timeout prevents the notebook from hanging forever on an unresponsive\n",
    "        # host; raise_for_status avoids silently scraping an error page (e.g. 404).\n",
    "        response = requests.get(self.url, timeout=30)\n",
    "        response.raise_for_status()\n",
    "        soup = BeautifulSoup(response.content, 'html.parser')\n",
    "        \n",
    "        # Extract title (soup.title.string can be None for an empty <title> tag)\n",
    "        title = soup.title.string if soup.title and soup.title.string else \"No title found\"\n",
    "        \n",
    "        # Extract text content, removing non-content tags first\n",
    "        if soup.body:\n",
    "            for tag in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
    "                tag.decompose()  # Remove unwanted tags\n",
    "            text = soup.body.get_text(separator=\"\\n\", strip=True)\n",
    "        else:\n",
    "            text = \"\"\n",
    "        \n",
    "        # Extract links (href attributes of all anchor tags, skipping empty ones)\n",
    "        links = [link.get('href') for link in soup.find_all('a') if link.get('href')]\n",
    "        \n",
    "        return title, text, links\n",
    "\n",
    "    def get_contents(self) -> str:\n",
    "        \"\"\"\n",
    "        Get a formatted string of the website contents.\n",
    "        \n",
    "        :return: A string containing the website title and text content\n",
    "        \"\"\"\n",
    "        return f\"Webpage Title:\\n{self.title}\\nWebpage Contents:\\n{self.text}\\n\\n\"\n",
    "\n",
    "class LinkAnalyzer:\n",
    "    \"\"\"\n",
    "    A class to analyze and categorize links from a website.\n",
    "    \"\"\"\n",
    "    # System prompt for the OpenAI model to categorize links.\n",
    "    # NOTE: the actual link list is delivered in the *user* message (see\n",
    "    # get_links below), so this prompt must not claim the links follow here —\n",
    "    # the previous version ended with an unfilled [INSERT LINK LIST HERE]\n",
    "    # placeholder that was never substituted.\n",
    "    LINK_SYSTEM_PROMPT = \"\"\"\n",
    "    You are provided with a list of links found on a webpage. Your task is to first categorize each link into one of the following categories:\n",
    "    - about page\n",
    "    - careers page\n",
    "    - terms of service\n",
    "    - privacy policy\n",
    "    - contact page\n",
    "    - other (please specify).\n",
    "\n",
    "    Once the links are categorized, please choose which links are most relevant to include in a brochure about the company. \n",
    "    The brochure should only include links such as About pages, Careers pages, or Company Overview pages. Exclude any links related to Terms of Service, Privacy Policy, or email addresses.\n",
    "\n",
    "    Respond in the following JSON format:\n",
    "    {\n",
    "        \"categorized_links\": [\n",
    "            {\"category\": \"about page\", \"url\": \"https://full.url/about\"},\n",
    "            {\"category\": \"careers page\", \"url\": \"https://full.url/careers\"},\n",
    "            {\"category\": \"terms of service\", \"url\": \"https://full.url/terms\"},\n",
    "            {\"category\": \"privacy policy\", \"url\": \"https://full.url/privacy\"},\n",
    "            {\"category\": \"other\", \"specify\": \"contact page\", \"url\": \"https://full.url/contact\"}\n",
    "        ],\n",
    "        \"brochure_links\": [\n",
    "            {\"type\": \"about page\", \"url\": \"https://full.url/about\"},\n",
    "            {\"type\": \"careers page\", \"url\": \"https://full.url/careers\"}\n",
    "        ]\n",
    "    }\n",
    "    \"\"\"\n",
    "\n",
    "    @staticmethod\n",
    "    def get_links(website: Website) -> Dict:\n",
    "        \"\"\"\n",
    "        Analyze and categorize links from a given website.\n",
    "        \n",
    "        :param website: A Website object containing the links to analyze\n",
    "        :return: A dictionary containing categorized links and brochure-relevant links\n",
    "        \"\"\"\n",
    "        # Prepare the user prompt for the OpenAI model; this is where the\n",
    "        # actual link list is delivered to the model\n",
    "        user_prompt = f\"Here is the list of links on the website of {website.url} - \"\n",
    "        user_prompt += \"please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \\\n",
    "    Do not include Terms of Service, Privacy, email links.\\n\"\n",
    "        user_prompt += \"Links (some might be relative links):\\n\"\n",
    "        user_prompt += \"\\n\".join(website.links)\n",
    "\n",
    "        # Make an API call to OpenAI for link analysis, constraining the reply\n",
    "        # to a JSON object so json.loads below cannot fail on surrounding prose\n",
    "        completion = openai.chat.completions.create(\n",
    "            model=MODEL,\n",
    "            messages=[\n",
    "                {\"role\": \"system\", \"content\": LinkAnalyzer.LINK_SYSTEM_PROMPT},\n",
    "                {\"role\": \"user\", \"content\": user_prompt}\n",
    "            ],\n",
    "            response_format={\"type\": \"json_object\"}\n",
    "        )\n",
    "        return json.loads(completion.choices[0].message.content)\n",
    "\n",
    "class BrochureGenerator:\n",
    "    \"\"\"\n",
    "    A class to generate a company brochure based on website content.\n",
    "    \"\"\"\n",
    "    # System prompt for the OpenAI model to generate the brochure\n",
    "    SYSTEM_PROMPT = \"\"\"\n",
    "    You are an assistant that analyzes the contents of several relevant pages from a company website \n",
    "    and creates a brochure about the company for prospective customers, investors and recruits. Respond in markdown.\n",
    "    Include details of company culture, customers and careers/jobs if you have the information.\n",
    "    Structure the brochure to include specific sections as follows:\n",
    "    About Us\n",
    "    What we do\n",
    "    How We Do It\n",
    "    Where We Do It\n",
    "    Our People\n",
    "    Our Culture\n",
    "    Connect with Us.\n",
    "    Please provide two versions of the brochure, the first in English, the second in Spanish. The contents of the brochure are to be the same for both languages.\n",
    "    \"\"\"\n",
    "\n",
    "    @staticmethod\n",
    "    def get_all_details(url: str) -> str:\n",
    "        \"\"\"\n",
    "        Gather all relevant details from a company's website.\n",
    "        \n",
    "        :param url: The URL of the company's main page\n",
    "        :return: A string containing all relevant website content\n",
    "        \"\"\"\n",
    "        result = \"Landing page:\\n\"\n",
    "        website = Website(url)\n",
    "        result += website.get_contents()\n",
    "\n",
    "        # Analyze links and get brochure-relevant ones\n",
    "        links = LinkAnalyzer.get_links(website)\n",
    "        brochure_links = links.get('brochure_links', [])\n",
    "        print(\"Found Brochure links:\", brochure_links)\n",
    "\n",
    "        # Gather content from brochure-relevant pages. The model's JSON is not\n",
    "        # guaranteed to be well-formed, so tolerate missing keys rather than\n",
    "        # raising KeyError mid-crawl.\n",
    "        for link in brochure_links:\n",
    "            if not link.get(\"url\"):\n",
    "                continue  # skip malformed entries with no URL\n",
    "            result += f\"\\n\\n{link.get('type', 'related page')}:\\n\"\n",
    "            full_url = urljoin(url, link[\"url\"])  # resolve relative links against the landing page\n",
    "            result += Website(full_url).get_contents()\n",
    "\n",
    "        return result\n",
    "\n",
    "    @staticmethod\n",
    "    def get_brochure_user_prompt(company_name: str, url: str) -> str:\n",
    "        \"\"\"\n",
    "        Generate a user prompt for the OpenAI model to create a brochure.\n",
    "        \n",
    "        :param company_name: The name of the company\n",
    "        :param url: The URL of the company's main page\n",
    "        :return: A string containing the user prompt for brochure generation\n",
    "        \"\"\"\n",
    "        user_prompt = f\"You are looking at a company called: {company_name}\\n\"\n",
    "        user_prompt += \"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\\n\"\n",
    "        user_prompt += BrochureGenerator.get_all_details(url)\n",
    "        return user_prompt[:20_000]  # Truncate to keep the request within the model's context budget\n",
    "\n",
    "    @staticmethod\n",
    "    def stream_brochure(company_name: str, url: str):\n",
    "        \"\"\"\n",
    "        Generate and stream a company brochure, rendering it live in the notebook.\n",
    "        \n",
    "        :param company_name: The name of the company\n",
    "        :param url: The URL of the company's main page\n",
    "        \"\"\"\n",
    "        # Make a streaming API call to OpenAI for brochure generation\n",
    "        stream = openai.chat.completions.create(\n",
    "            model=MODEL,\n",
    "            messages=[\n",
    "                {\"role\": \"system\", \"content\": BrochureGenerator.SYSTEM_PROMPT},\n",
    "                {\"role\": \"user\", \"content\": BrochureGenerator.get_brochure_user_prompt(company_name, url)}\n",
    "            ],\n",
    "            stream=True\n",
    "        )\n",
    "\n",
    "        # Display the generated brochure in real-time\n",
    "        response = \"\"\n",
    "        display_handle = display(Markdown(\"\"), display_id=True)\n",
    "        for chunk in stream:\n",
    "            response += chunk.choices[0].delta.content or ''\n",
    "            # Strip code fences for display: remove a ```markdown opener first,\n",
    "            # then any bare ``` fences. The previous blanket .replace(\"markdown\", \"\")\n",
    "            # also deleted the word 'markdown' from the brochure text itself.\n",
    "            cleaned = response.replace(\"```markdown\", \"\").replace(\"```\", \"\")\n",
    "            update_display(Markdown(cleaned), display_id=display_handle.display_id)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cc4965cf-f704-4d40-8b7d-f8e50913f87c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Main execution block: generate and stream a brochure for a sample company\n",
    "if __name__ == \"__main__\":\n",
    "    company_name = \"Edward Donner\"\n",
    "    company_url = \"https://edwarddonner.com/\"\n",
    "    BrochureGenerator.stream_brochure(company_name, company_url)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0acb1194-fe89-40e3-8c3b-a10483315d3f",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
